/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if defined(__aarch64__)

#include "arm_gemm.hpp"
#include <cstddef>
#include <cstdint>

namespace arm_conv {
namespace depthwise {

void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
  const int8_t *const *const inptrs,
  int8_t *const *const outptrs,
  const int8_t *weights,
  const int32_t *bias,
  const unsigned int kernel_points,
  const unsigned int n_output_channels,
  const int32_t *per_channel_left_shifts,
  const int32_t *per_channel_muls,
  const int32_t *per_channel_right_shifts,
  const arm_gemm::Requantize32& qp
)
{
  __asm__ __volatile__(
    "mov x9, #0x0\n"
    "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
    "ld1r { v14.4s }, [x19]\n"
    "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
    "ld1r { v13.4s }, [x19]\n"
    "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
    "ld1r { v12.16b }, [x19]\n"
    "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
    "ld1r { v11.16b }, [x19]\n"
    "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
    "ld1r { v10.4s }, [x19]\n"
    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
    "ld1r { v9.4s }, [x19]\n"
    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
    "ld1r { v8.4s }, [x19]\n"
    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
    "ld1r { v7.4s }, [x19]\n"
    "lsr x28, %x[n_output_channels], #0x2\n"
    "cbz x28, 9f\n"
    "1:"  // Output channel loop
    "movi v16.4s, #0x0\n"
    "cbz %x[bias], 2f\n"
    "lsl x19, x9, #0x2\n"
    "ldr q16, [%x[bias], x19]\n"
    "2:"  // Output channel loop: Load bias: Done
    "mov v6.16b, v16.16b\n"
    "mov v5.16b, v16.16b\n"
    "mov v4.16b, v16.16b\n"
    "mov v31.16b, v16.16b\n"
    "mov v30.16b, v16.16b\n"
    "mov v29.16b, v16.16b\n"
    "mov v28.16b, v16.16b\n"
    "mov v27.16b, v16.16b\n"
    "mov v26.16b, v16.16b\n"
    "mov v25.16b, v16.16b\n"
    "mov v24.16b, v16.16b\n"
    "mov v23.16b, v16.16b\n"
    "mov v22.16b, v16.16b\n"
    "mov v21.16b, v16.16b\n"
    "mov v20.16b, v16.16b\n"
    "mov v19.16b, v16.16b\n"
    "cbz %x[rq_mul_ptr], 3f\n"
    "lsl x19, x9, #0x2\n"
    "ldr q8, [%x[rq_mul_ptr], x19]\n"
    "ldr q7, [%x[rq_right_shift_ptr], x19]\n"
    "cbz %x[rq_left_shift_ptr], 3f\n"
    "ldr q9, [%x[rq_left_shift_ptr], x19]\n"
    "3:"  // Output channel loop: Load quantization parameters: Done
    "ldr s17, [%x[weights]], #0x4\n"
    "ssubl v17.8h, v17.8b, v11.8b\n"
    "mov x19, %x[inptrs]\n"
    "ldp x25, x27, [x19], #0x10\n"
    "lsr x20, %x[kernel_points], #0x1\n"
    "ldr d3, [x25, #0x0]\n"
    "ssubl v3.8h, v3.8b, v12.8b\n"
    "ldr d2, [x27, #0x0]\n"
    "ssubl v2.8h, v2.8b, v12.8b\n"
    "cbz x20, 7f\n"
    "ldp x25, x27, [x19], #0x10\n"
    "ldr s16, [%x[weights]], #0x4\n"
    "ssubl v16.8h, v16.8b, v11.8b\n"
    "ldr d1, [x25, #0x0]\n"
    "subs x20, x20, #0x1\n"
    "ssubl v1.8h, v1.8b, v12.8b\n"
    "ldr d0, [x27, #0x0]\n"
    "ssubl v0.8h, v0.8b, v12.8b\n"
    "beq 5f\n"
    "4:"  // Output channel loop: Kernel loop
    "smlal v6.4s, v17.4h, v3.h[0]\n"
    "ldp x25, x27, [x19], #0x10\n"
    "subs x20, x20, #0x1\n"
    "smlal v5.4s, v17.4h, v3.h[1]\n"
    "smlal v4.4s, v17.4h, v3.h[2]\n"
    "smlal v31.4s, v17.4h, v3.h[3]\n"
    "smlal v30.4s, v17.4h, v3.h[4]\n"
    "smlal v29.4s, v17.4h, v3.h[5]\n"
    "smlal v28.4s, v17.4h, v3.h[6]\n"
    "smlal v27.4s, v17.4h, v3.h[7]\n"
    "ldr d3, [x25, #0x0]\n"
    "smlal v26.4s, v17.4h, v2.h[0]\n"
    "smlal v25.4s, v17.4h, v2.h[1]\n"
    "smlal v24.4s, v17.4h, v2.h[2]\n"
    "smlal v23.4s, v17.4h, v2.h[3]\n"
    "smlal v22.4s, v17.4h, v2.h[4]\n"
    "smlal v21.4s, v17.4h, v2.h[5]\n"
    "smlal v20.4s, v17.4h, v2.h[6]\n"
    "smlal v19.4s, v17.4h, v2.h[7]\n"
    "ldr d2, [x27, #0x0]\n"
    "ssubl v3.8h, v3.8b, v12.8b\n"
    "ldr s17, [%x[weights]], #0x4\n"
    "smlal v6.4s, v16.4h, v1.h[0]\n"
    "ldp x25, x27, [x19], #0x10\n"
    "smlal v5.4s, v16.4h, v1.h[1]\n"
    "smlal v4.4s, v16.4h, v1.h[2]\n"
    "ssubl v2.8h, v2.8b, v12.8b\n"
    "ssubl v17.8h, v17.8b, v11.8b\n"
    "smlal v31.4s, v16.4h, v1.h[3]\n"
    "smlal v30.4s, v16.4h, v1.h[4]\n"
    "smlal v29.4s, v16.4h, v1.h[5]\n"
    "smlal v28.4s, v16.4h, v1.h[6]\n"
    "smlal v27.4s, v16.4h, v1.h[7]\n"
    "ldr d1, [x25, #0x0]\n"
    "smlal v26.4s, v16.4h, v0.h[0]\n"
    "smlal v25.4s, v16.4h, v0.h[1]\n"
    "smlal v24.4s, v16.4h, v0.h[2]\n"
    "smlal v23.4s, v16.4h, v0.h[3]\n"
    "smlal v22.4s, v16.4h, v0.h[4]\n"
    "smlal v21.4s, v16.4h, v0.h[5]\n"
    "smlal v20.4s, v16.4h, v0.h[6]\n"
    "smlal v19.4s, v16.4h, v0.h[7]\n"
    "ldr d0, [x27, #0x0]\n"
    "ssubl v1.8h, v1.8b, v12.8b\n"
    "ldr s16, [%x[weights]], #0x4\n"
    "ssubl v0.8h, v0.8b, v12.8b\n"
    "ssubl v16.8h, v16.8b, v11.8b\n"
    "bgt 4b\n"
    "5:"  // Output channel loop: Kernel loop tail
    "tbnz %x[kernel_points], #0, 6f\n"
    "smlal v6.4s, v17.4h, v3.h[0]\n"
    "ldr x19, [%x[outptrs], #0x0]\n"
    "smlal v5.4s, v17.4h, v3.h[1]\n"
    "ldr x20, [%x[outptrs], #0x8]\n"
    "smlal v4.4s, v17.4h, v3.h[2]\n"
    "ldr x21, [%x[outptrs], #0x10]\n"
    "smlal v31.4s, v17.4h, v3.h[3]\n"
    "ldr x22, [%x[outptrs], #0x18]\n"
    "smlal v30.4s, v17.4h, v3.h[4]\n"
    "ldr x23, [%x[outptrs], #0x20]\n"
    "smlal v29.4s, v17.4h, v3.h[5]\n"
    "ldr x24, [%x[outptrs], #0x28]\n"
    "smlal v28.4s, v17.4h, v3.h[6]\n"
    "ldr x25, [%x[outptrs], #0x30]\n"
    "smlal v27.4s, v17.4h, v3.h[7]\n"
    "ldr x26, [%x[outptrs], #0x38]\n"
    "smlal v26.4s, v17.4h, v2.h[0]\n"
    "smlal v25.4s, v17.4h, v2.h[1]\n"
    "smlal v24.4s, v17.4h, v2.h[2]\n"
    "smlal v23.4s, v17.4h, v2.h[3]\n"
    "smlal v22.4s, v17.4h, v2.h[4]\n"
    "smlal v21.4s, v17.4h, v2.h[5]\n"
    "smlal v20.4s, v17.4h, v2.h[6]\n"
    "smlal v19.4s, v17.4h, v2.h[7]\n"
    "smlal v6.4s, v16.4h, v1.h[0]\n"
    "smlal v5.4s, v16.4h, v1.h[1]\n"
    "smlal v4.4s, v16.4h, v1.h[2]\n"
    "smlal v31.4s, v16.4h, v1.h[3]\n"
    "smlal v30.4s, v16.4h, v1.h[4]\n"
    "smlal v29.4s, v16.4h, v1.h[5]\n"
    "smlal v28.4s, v16.4h, v1.h[6]\n"
    "smlal v27.4s, v16.4h, v1.h[7]\n"
    "smlal v26.4s, v16.4h, v0.h[0]\n"
    "smlal v25.4s, v16.4h, v0.h[1]\n"
    "smlal v24.4s, v16.4h, v0.h[2]\n"
    "smlal v23.4s, v16.4h, v0.h[3]\n"
    "smlal v22.4s, v16.4h, v0.h[4]\n"
    "smlal v21.4s, v16.4h, v0.h[5]\n"
    "smlal v20.4s, v16.4h, v0.h[6]\n"
    "smlal v19.4s, v16.4h, v0.h[7]\n"
    "sshl v6.4s, v6.4s, v9.4s\n"
    "sshl v5.4s, v5.4s, v9.4s\n"
    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
    "sshl v4.4s, v4.4s, v9.4s\n"
    "sshl v31.4s, v31.4s, v9.4s\n"
    "and v18.16b, v6.16b, v7.16b\n"
    "and v16.16b, v5.16b, v7.16b\n"
    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
    "sqadd v6.4s, v6.4s, v18.4s\n"
    "sqadd v5.4s, v5.4s, v16.4s\n"
    "and v17.16b, v4.16b, v7.16b\n"
    "and v16.16b, v31.16b, v7.16b\n"
    "srshl v6.4s, v6.4s, v7.4s\n"
    "srshl v5.4s, v5.4s, v7.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "add v6.4s, v6.4s, v10.4s\n"
    "add v5.4s, v5.4s, v10.4s\n"
    "sqadd v4.4s, v4.4s, v17.4s\n"
    "smin v6.4s, v6.4s, v13.4s\n"
    "smin v5.4s, v5.4s, v13.4s\n"
    "sqadd v31.4s, v31.4s, v16.4s\n"
    "smax v6.4s, v6.4s, v14.4s\n"
    "smax v5.4s, v5.4s, v14.4s\n"
    "srshl v4.4s, v4.4s, v7.4s\n"
    "uzp1 v6.16b, v6.16b, v6.16b\n"
    "uzp1 v5.16b, v5.16b, v5.16b\n"
    "uzp1 v6.16b, v6.16b, v6.16b\n"
    "str s6, [x19, x9]\n"
    "uzp1 v5.16b, v5.16b, v5.16b\n"
    "add v4.4s, v4.4s, v10.4s\n"
    "ldr x19, [%x[outptrs], #0x40]\n"
    "srshl v31.4s, v31.4s, v7.4s\n"
    "str s5, [x20, x9]\n"
    "sshl v30.4s, v30.4s, v9.4s\n"
    "ldr x20, [%x[outptrs], #0x48]\n"
    "smin v4.4s, v4.4s, v13.4s\n"
    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
    "add v31.4s, v31.4s, v10.4s\n"
    "smax v4.4s, v4.4s, v14.4s\n"
    "sshl v29.4s, v29.4s, v9.4s\n"
    "smin v31.4s, v31.4s, v13.4s\n"
    "uzp1 v4.16b, v4.16b, v4.16b\n"
    "and v16.16b, v30.16b, v7.16b\n"
    "uzp1 v4.16b, v4.16b, v4.16b\n"
    "str s4, [x21, x9]\n"
    "smax v31.4s, v31.4s, v14.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "ldr x21, [%x[outptrs], #0x50]\n"
    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
    "sshl v28.4s, v28.4s, v9.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "sqadd v30.4s, v30.4s, v16.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "str s31, [x22, x9]\n"
    "and v17.16b, v29.16b, v7.16b\n"
    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
    "ldr x22, [%x[outptrs], #0x58]\n"
    "srshl v30.4s, v30.4s, v7.4s\n"
    "sshl v27.4s, v27.4s, v9.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "and v16.16b, v28.16b, v7.16b\n"
    "add v30.4s, v30.4s, v10.4s\n"
    "sqadd v29.4s, v29.4s, v17.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smin v30.4s, v30.4s, v13.4s\n"
    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
    "srshl v29.4s, v29.4s, v7.4s\n"
    "smax v30.4s, v30.4s, v14.4s\n"
    "sqadd v28.4s, v28.4s, v16.4s\n"
    "and v16.16b, v27.16b, v7.16b\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "add v29.4s, v29.4s, v10.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "str s30, [x23, x9]\n"
    "smin v29.4s, v29.4s, v13.4s\n"
    "srshl v28.4s, v28.4s, v7.4s\n"
    "ldr x23, [%x[outptrs], #0x60]\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshl v26.4s, v26.4s, v9.4s\n"
    "smax v29.4s, v29.4s, v14.4s\n"
    "add v28.4s, v28.4s, v10.4s\n"
    "sqadd v27.4s, v27.4s, v16.4s\n"
    "uzp1 v29.16b, v29.16b, v29.16b\n"
    "smin v28.4s, v28.4s, v13.4s\n"
    "uzp1 v29.16b, v29.16b, v29.16b\n"
    "str s29, [x24, x9]\n"
    "smax v28.4s, v28.4s, v14.4s\n"
    "srshl v27.4s, v27.4s, v7.4s\n"
    "ldr x24, [%x[outptrs], #0x68]\n"
    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
    "sshl v25.4s, v25.4s, v9.4s\n"
    "uzp1 v28.16b, v28.16b, v28.16b\n"
    "add v27.4s, v27.4s, v10.4s\n"
    "uzp1 v28.16b, v28.16b, v28.16b\n"
    "str s28, [x25, x9]\n"
    "smin v27.4s, v27.4s, v13.4s\n"
    "and v17.16b, v26.16b, v7.16b\n"
    "ldr x25, [%x[outptrs], #0x70]\n"
    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
    "sshl v24.4s, v24.4s, v9.4s\n"
    "smax v27.4s, v27.4s, v14.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "and v16.16b, v25.16b, v7.16b\n"
    "uzp1 v27.16b, v27.16b, v27.16b\n"
    "sqadd v26.4s, v26.4s, v17.4s\n"
    "uzp1 v27.16b, v27.16b, v27.16b\n"
    "str s27, [x26, x9]\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
    "ldr x26, [%x[outptrs], #0x78]\n"
    "srshl v26.4s, v26.4s, v7.4s\n"
    "sshl v23.4s, v23.4s, v9.4s\n"
    "sqadd v25.4s, v25.4s, v16.4s\n"
    "and v17.16b, v24.16b, v7.16b\n"
    "add v26.4s, v26.4s, v10.4s\n"
    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
    "srshl v25.4s, v25.4s, v7.4s\n"
    "smin v26.4s, v26.4s, v13.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "and v16.16b, v23.16b, v7.16b\n"
    "smax v26.4s, v26.4s, v14.4s\n"
    "add v25.4s, v25.4s, v10.4s\n"
    "sqadd v24.4s, v24.4s, v17.4s\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "smin v25.4s, v25.4s, v13.4s\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "str s26, [x19, x9]\n"
    "smax v25.4s, v25.4s, v14.4s\n"
    "srshl v24.4s, v24.4s, v7.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshl v22.4s, v22.4s, v9.4s\n"
    "uzp1 v25.16b, v25.16b, v25.16b\n"
    "add v24.4s, v24.4s, v10.4s\n"
    "uzp1 v25.16b, v25.16b, v25.16b\n"
    "str s25, [x20, x9]\n"
    "smin v24.4s, v24.4s, v13.4s\n"
    "sqadd v23.4s, v23.4s, v16.4s\n"
    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
    "sshl v21.4s, v21.4s, v9.4s\n"
    "smax v24.4s, v24.4s, v14.4s\n"
    "srshl v23.4s, v23.4s, v7.4s\n"
    "and v17.16b, v22.16b, v7.16b\n"
    "uzp1 v24.16b, v24.16b, v24.16b\n"
    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
    "uzp1 v24.16b, v24.16b, v24.16b\n"
    "str s24, [x21, x9]\n"
    "add v23.4s, v23.4s, v10.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "and v16.16b, v21.16b, v7.16b\n"
    "sshl v20.4s, v20.4s, v9.4s\n"
    "smin v23.4s, v23.4s, v13.4s\n"
    "sqadd v22.4s, v22.4s, v17.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smax v23.4s, v23.4s, v14.4s\n"
    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
    "srshl v22.4s, v22.4s, v7.4s\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "sqadd v21.4s, v21.4s, v16.4s\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "str s23, [x22, x9]\n"
    "add v22.4s, v22.4s, v10.4s\n"
    "and v16.16b, v20.16b, v7.16b\n"
    "srshl v21.4s, v21.4s, v7.4s\n"
    "sshl v19.4s, v19.4s, v9.4s\n"
    "smin v22.4s, v22.4s, v13.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "add v21.4s, v21.4s, v10.4s\n"
    "smax v22.4s, v22.4s, v14.4s\n"
    "sqadd v20.4s, v20.4s, v16.4s\n"
    "smin v21.4s, v21.4s, v13.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "str s22, [x23, x9]\n"
    "smax v21.4s, v21.4s, v14.4s\n"
    "srshl v20.4s, v20.4s, v7.4s\n"
    "and v16.16b, v19.16b, v7.16b\n"
    "uzp1 v21.16b, v21.16b, v21.16b\n"
    "add v20.4s, v20.4s, v10.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "uzp1 v21.16b, v21.16b, v21.16b\n"
    "str s21, [x24, x9]\n"
    "smin v20.4s, v20.4s, v13.4s\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    "smax v20.4s, v20.4s, v14.4s\n"
    "srshl v19.4s, v19.4s, v7.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "str s20, [x25, x9]\n"
    "add v19.4s, v19.4s, v10.4s\n"
    "smin v19.4s, v19.4s, v13.4s\n"
    "smax v19.4s, v19.4s, v14.4s\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "str s19, [x26, x9]\n"
    "b 8f\n"
    "6:"  // Output channel loop: Odd tail
    "smlal v6.4s, v17.4h, v3.h[0]\n"
    "ldp x25, x27, [x19], #0x10\n"
    "smlal v5.4s, v17.4h, v3.h[1]\n"
    "ldr x19, [%x[outptrs], #0x0]\n"
    "smlal v4.4s, v17.4h, v3.h[2]\n"
    "ldr x20, [%x[outptrs], #0x8]\n"
    "smlal v31.4s, v17.4h, v3.h[3]\n"
    "ldr x21, [%x[outptrs], #0x10]\n"
    "smlal v30.4s, v17.4h, v3.h[4]\n"
    "ldr x22, [%x[outptrs], #0x18]\n"
    "smlal v29.4s, v17.4h, v3.h[5]\n"
    "ldr x23, [%x[outptrs], #0x20]\n"
    "smlal v28.4s, v17.4h, v3.h[6]\n"
    "ldr x24, [%x[outptrs], #0x28]\n"
    "smlal v27.4s, v17.4h, v3.h[7]\n"
    "ldr d3, [x25, #0x0]\n"
    "smlal v26.4s, v17.4h, v2.h[0]\n"
    "ldr x25, [%x[outptrs], #0x30]\n"
    "smlal v25.4s, v17.4h, v2.h[1]\n"
    "ldr x26, [%x[outptrs], #0x38]\n"
    "smlal v24.4s, v17.4h, v2.h[2]\n"
    "smlal v23.4s, v17.4h, v2.h[3]\n"
    "smlal v22.4s, v17.4h, v2.h[4]\n"
    "smlal v21.4s, v17.4h, v2.h[5]\n"
    "smlal v20.4s, v17.4h, v2.h[6]\n"
    "smlal v19.4s, v17.4h, v2.h[7]\n"
    "ldr d2, [x27, #0x0]\n"
    "ssubl v3.8h, v3.8b, v12.8b\n"
    "ldr s17, [%x[weights]], #0x4\n"
    "smlal v6.4s, v16.4h, v1.h[0]\n"
    "smlal v5.4s, v16.4h, v1.h[1]\n"
    "smlal v4.4s, v16.4h, v1.h[2]\n"
    "ssubl v2.8h, v2.8b, v12.8b\n"
    "ssubl v17.8h, v17.8b, v11.8b\n"
    "smlal v31.4s, v16.4h, v1.h[3]\n"
    "smlal v30.4s, v16.4h, v1.h[4]\n"
    "smlal v29.4s, v16.4h, v1.h[5]\n"
    "smlal v28.4s, v16.4h, v1.h[6]\n"
    "smlal v27.4s, v16.4h, v1.h[7]\n"
    "smlal v26.4s, v16.4h, v0.h[0]\n"
    "smlal v25.4s, v16.4h, v0.h[1]\n"
    "smlal v24.4s, v16.4h, v0.h[2]\n"
    "smlal v23.4s, v16.4h, v0.h[3]\n"
    "smlal v22.4s, v16.4h, v0.h[4]\n"
    "smlal v21.4s, v16.4h, v0.h[5]\n"
    "smlal v20.4s, v16.4h, v0.h[6]\n"
    "smlal v19.4s, v16.4h, v0.h[7]\n"
    "smlal v6.4s, v17.4h, v3.h[0]\n"
    "smlal v5.4s, v17.4h, v3.h[1]\n"
    "smlal v4.4s, v17.4h, v3.h[2]\n"
    "smlal v31.4s, v17.4h, v3.h[3]\n"
    "smlal v30.4s, v17.4h, v3.h[4]\n"
    "smlal v29.4s, v17.4h, v3.h[5]\n"
    "smlal v28.4s, v17.4h, v3.h[6]\n"
    "smlal v27.4s, v17.4h, v3.h[7]\n"
    "smlal v26.4s, v17.4h, v2.h[0]\n"
    "smlal v25.4s, v17.4h, v2.h[1]\n"
    "smlal v24.4s, v17.4h, v2.h[2]\n"
    "smlal v23.4s, v17.4h, v2.h[3]\n"
    "smlal v22.4s, v17.4h, v2.h[4]\n"
    "smlal v21.4s, v17.4h, v2.h[5]\n"
    "smlal v20.4s, v17.4h, v2.h[6]\n"
    "smlal v19.4s, v17.4h, v2.h[7]\n"
    "sshl v6.4s, v6.4s, v9.4s\n"
    "sshl v5.4s, v5.4s, v9.4s\n"
    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
    "sshl v4.4s, v4.4s, v9.4s\n"
    "sshl v31.4s, v31.4s, v9.4s\n"
    "and v18.16b, v6.16b, v7.16b\n"
    "and v16.16b, v5.16b, v7.16b\n"
    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
    "sqadd v6.4s, v6.4s, v18.4s\n"
    "sqadd v5.4s, v5.4s, v16.4s\n"
    "and v17.16b, v4.16b, v7.16b\n"
    "and v16.16b, v31.16b, v7.16b\n"
    "srshl v6.4s, v6.4s, v7.4s\n"
    "srshl v5.4s, v5.4s, v7.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "add v6.4s, v6.4s, v10.4s\n"
    "add v5.4s, v5.4s, v10.4s\n"
    "sqadd v4.4s, v4.4s, v17.4s\n"
    "smin v6.4s, v6.4s, v13.4s\n"
    "smin v5.4s, v5.4s, v13.4s\n"
    "sqadd v31.4s, v31.4s, v16.4s\n"
    "smax v6.4s, v6.4s, v14.4s\n"
    "smax v5.4s, v5.4s, v14.4s\n"
    "srshl v4.4s, v4.4s, v7.4s\n"
    "uzp1 v6.16b, v6.16b, v6.16b\n"
    "uzp1 v5.16b, v5.16b, v5.16b\n"
    "uzp1 v6.16b, v6.16b, v6.16b\n"
    "str s6, [x19, x9]\n"
    "uzp1 v5.16b, v5.16b, v5.16b\n"
    "add v4.4s, v4.4s, v10.4s\n"
    "ldr x19, [%x[outptrs], #0x40]\n"
    "srshl v31.4s, v31.4s, v7.4s\n"
    "str s5, [x20, x9]\n"
    "sshl v30.4s, v30.4s, v9.4s\n"
    "ldr x20, [%x[outptrs], #0x48]\n"
    "smin v4.4s, v4.4s, v13.4s\n"
    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
    "add v31.4s, v31.4s, v10.4s\n"
    "smax v4.4s, v4.4s, v14.4s\n"
    "sshl v29.4s, v29.4s, v9.4s\n"
    "smin v31.4s, v31.4s, v13.4s\n"
    "uzp1 v4.16b, v4.16b, v4.16b\n"
    "and v16.16b, v30.16b, v7.16b\n"
    "uzp1 v4.16b, v4.16b, v4.16b\n"
    "str s4, [x21, x9]\n"
    "smax v31.4s, v31.4s, v14.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "ldr x21, [%x[outptrs], #0x50]\n"
    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
    "sshl v28.4s, v28.4s, v9.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "sqadd v30.4s, v30.4s, v16.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "str s31, [x22, x9]\n"
    "and v17.16b, v29.16b, v7.16b\n"
    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
    "ldr x22, [%x[outptrs], #0x58]\n"
    "srshl v30.4s, v30.4s, v7.4s\n"
    "sshl v27.4s, v27.4s, v9.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "and v16.16b, v28.16b, v7.16b\n"
    "add v30.4s, v30.4s, v10.4s\n"
    "sqadd v29.4s, v29.4s, v17.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smin v30.4s, v30.4s, v13.4s\n"
    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
    "srshl v29.4s, v29.4s, v7.4s\n"
    "smax v30.4s, v30.4s, v14.4s\n"
    "sqadd v28.4s, v28.4s, v16.4s\n"
    "and v16.16b, v27.16b, v7.16b\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "add v29.4s, v29.4s, v10.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "str s30, [x23, x9]\n"
    "smin v29.4s, v29.4s, v13.4s\n"
    "srshl v28.4s, v28.4s, v7.4s\n"
    "ldr x23, [%x[outptrs], #0x60]\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshl v26.4s, v26.4s, v9.4s\n"
    "smax v29.4s, v29.4s, v14.4s\n"
    "add v28.4s, v28.4s, v10.4s\n"
    "sqadd v27.4s, v27.4s, v16.4s\n"
    "uzp1 v29.16b, v29.16b, v29.16b\n"
    "smin v28.4s, v28.4s, v13.4s\n"
    "uzp1 v29.16b, v29.16b, v29.16b\n"
    "str s29, [x24, x9]\n"
    "smax v28.4s, v28.4s, v14.4s\n"
    "srshl v27.4s, v27.4s, v7.4s\n"
    "ldr x24, [%x[outptrs], #0x68]\n"
    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
    "sshl v25.4s, v25.4s, v9.4s\n"
    "uzp1 v28.16b, v28.16b, v28.16b\n"
    "add v27.4s, v27.4s, v10.4s\n"
    "uzp1 v28.16b, v28.16b, v28.16b\n"
    "str s28, [x25, x9]\n"
    "smin v27.4s, v27.4s, v13.4s\n"
    "and v17.16b, v26.16b, v7.16b\n"
    "ldr x25, [%x[outptrs], #0x70]\n"
    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
    "sshl v24.4s, v24.4s, v9.4s\n"
    "smax v27.4s, v27.4s, v14.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "and v16.16b, v25.16b, v7.16b\n"
    "uzp1 v27.16b, v27.16b, v27.16b\n"
    "sqadd v26.4s, v26.4s, v17.4s\n"
    "uzp1 v27.16b, v27.16b, v27.16b\n"
    "str s27, [x26, x9]\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
    "ldr x26, [%x[outptrs], #0x78]\n"
    "srshl v26.4s, v26.4s, v7.4s\n"
    "sshl v23.4s, v23.4s, v9.4s\n"
    "sqadd v25.4s, v25.4s, v16.4s\n"
    "and v17.16b, v24.16b, v7.16b\n"
    "add v26.4s, v26.4s, v10.4s\n"
    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
    "srshl v25.4s, v25.4s, v7.4s\n"
    "smin v26.4s, v26.4s, v13.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "and v16.16b, v23.16b, v7.16b\n"
    "smax v26.4s, v26.4s, v14.4s\n"
    "add v25.4s, v25.4s, v10.4s\n"
    "sqadd v24.4s, v24.4s, v17.4s\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "smin v25.4s, v25.4s, v13.4s\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "str s26, [x19, x9]\n"
    "smax v25.4s, v25.4s, v14.4s\n"
    "srshl v24.4s, v24.4s, v7.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshl v22.4s, v22.4s, v9.4s\n"
    "uzp1 v25.16b, v25.16b, v25.16b\n"
    "add v24.4s, v24.4s, v10.4s\n"
    "uzp1 v25.16b, v25.16b, v25.16b\n"
    "str s25, [x20, x9]\n"
    "smin v24.4s, v24.4s, v13.4s\n"
    "sqadd v23.4s, v23.4s, v16.4s\n"
    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
    "sshl v21.4s, v21.4s, v9.4s\n"
    "smax v24.4s, v24.4s, v14.4s\n"
    "srshl v23.4s, v23.4s, v7.4s\n"
    "and v17.16b, v22.16b, v7.16b\n"
    "uzp1 v24.16b, v24.16b, v24.16b\n"
    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
    "uzp1 v24.16b, v24.16b, v24.16b\n"
    "str s24, [x21, x9]\n"
    "add v23.4s, v23.4s, v10.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "and v16.16b, v21.16b, v7.16b\n"
    "sshl v20.4s, v20.4s, v9.4s\n"
    "smin v23.4s, v23.4s, v13.4s\n"
    "sqadd v22.4s, v22.4s, v17.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smax v23.4s, v23.4s, v14.4s\n"
    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
    "srshl v22.4s, v22.4s, v7.4s\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "sqadd v21.4s, v21.4s, v16.4s\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "str s23, [x22, x9]\n"
    "add v22.4s, v22.4s, v10.4s\n"
    "and v16.16b, v20.16b, v7.16b\n"
    "srshl v21.4s, v21.4s, v7.4s\n"
    "sshl v19.4s, v19.4s, v9.4s\n"
    "smin v22.4s, v22.4s, v13.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "add v21.4s, v21.4s, v10.4s\n"
    "smax v22.4s, v22.4s, v14.4s\n"
    "sqadd v20.4s, v20.4s, v16.4s\n"
    "smin v21.4s, v21.4s, v13.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "str s22, [x23, x9]\n"
    "smax v21.4s, v21.4s, v14.4s\n"
    "srshl v20.4s, v20.4s, v7.4s\n"
    "and v16.16b, v19.16b, v7.16b\n"
    "uzp1 v21.16b, v21.16b, v21.16b\n"
    "add v20.4s, v20.4s, v10.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "uzp1 v21.16b, v21.16b, v21.16b\n"
    "str s21, [x24, x9]\n"
    "smin v20.4s, v20.4s, v13.4s\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    "smax v20.4s, v20.4s, v14.4s\n"
    "srshl v19.4s, v19.4s, v7.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "str s20, [x25, x9]\n"
    "add v19.4s, v19.4s, v10.4s\n"
    "smin v19.4s, v19.4s, v13.4s\n"
    "smax v19.4s, v19.4s, v14.4s\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "str s19, [x26, x9]\n"
    "b 8f\n"
    "7:"  // Output channel loop: Single kernel point
    "smlal v6.4s, v17.4h, v3.h[0]\n"
    "ldr x19, [%x[outptrs], #0x0]\n"
    "smlal v5.4s, v17.4h, v3.h[1]\n"
    "ldr x20, [%x[outptrs], #0x8]\n"
    "smlal v4.4s, v17.4h, v3.h[2]\n"
    "ldr x21, [%x[outptrs], #0x10]\n"
    "smlal v31.4s, v17.4h, v3.h[3]\n"
    "ldr x22, [%x[outptrs], #0x18]\n"
    "smlal v30.4s, v17.4h, v3.h[4]\n"
    "ldr x23, [%x[outptrs], #0x20]\n"
    "smlal v29.4s, v17.4h, v3.h[5]\n"
    "ldr x24, [%x[outptrs], #0x28]\n"
    "smlal v28.4s, v17.4h, v3.h[6]\n"
    "ldr x25, [%x[outptrs], #0x30]\n"
    "smlal v27.4s, v17.4h, v3.h[7]\n"
    "ldr x26, [%x[outptrs], #0x38]\n"
    "smlal v26.4s, v17.4h, v2.h[0]\n"
    "smlal v25.4s, v17.4h, v2.h[1]\n"
    "smlal v24.4s, v17.4h, v2.h[2]\n"
    "smlal v23.4s, v17.4h, v2.h[3]\n"
    "smlal v22.4s, v17.4h, v2.h[4]\n"
    "smlal v21.4s, v17.4h, v2.h[5]\n"
    "smlal v20.4s, v17.4h, v2.h[6]\n"
    "smlal v19.4s, v17.4h, v2.h[7]\n"
    "sshl v6.4s, v6.4s, v9.4s\n"
    "sshl v5.4s, v5.4s, v9.4s\n"
    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
    "sshl v4.4s, v4.4s, v9.4s\n"
    "sshl v31.4s, v31.4s, v9.4s\n"
    "and v18.16b, v6.16b, v7.16b\n"
    "and v16.16b, v5.16b, v7.16b\n"
    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
    "sqadd v6.4s, v6.4s, v18.4s\n"
    "sqadd v5.4s, v5.4s, v16.4s\n"
    "and v17.16b, v4.16b, v7.16b\n"
    "and v16.16b, v31.16b, v7.16b\n"
    "srshl v6.4s, v6.4s, v7.4s\n"
    "srshl v5.4s, v5.4s, v7.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "add v6.4s, v6.4s, v10.4s\n"
    "add v5.4s, v5.4s, v10.4s\n"
    "sqadd v4.4s, v4.4s, v17.4s\n"
    "smin v6.4s, v6.4s, v13.4s\n"
    "smin v5.4s, v5.4s, v13.4s\n"
    "sqadd v31.4s, v31.4s, v16.4s\n"
    "smax v6.4s, v6.4s, v14.4s\n"
    "smax v5.4s, v5.4s, v14.4s\n"
    "srshl v4.4s, v4.4s, v7.4s\n"
    "uzp1 v6.16b, v6.16b, v6.16b\n"
    "uzp1 v5.16b, v5.16b, v5.16b\n"
    "uzp1 v6.16b, v6.16b, v6.16b\n"
    "str s6, [x19, x9]\n"
    "uzp1 v5.16b, v5.16b, v5.16b\n"
    "add v4.4s, v4.4s, v10.4s\n"
    "ldr x19, [%x[outptrs], #0x40]\n"
    "srshl v31.4s, v31.4s, v7.4s\n"
    "str s5, [x20, x9]\n"
    "sshl v30.4s, v30.4s, v9.4s\n"
    "ldr x20, [%x[outptrs], #0x48]\n"
    "smin v4.4s, v4.4s, v13.4s\n"
    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
    "add v31.4s, v31.4s, v10.4s\n"
    "smax v4.4s, v4.4s, v14.4s\n"
    "sshl v29.4s, v29.4s, v9.4s\n"
    "smin v31.4s, v31.4s, v13.4s\n"
    "uzp1 v4.16b, v4.16b, v4.16b\n"
    "and v16.16b, v30.16b, v7.16b\n"
    "uzp1 v4.16b, v4.16b, v4.16b\n"
    "str s4, [x21, x9]\n"
    "smax v31.4s, v31.4s, v14.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "ldr x21, [%x[outptrs], #0x50]\n"
    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
    "sshl v28.4s, v28.4s, v9.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "sqadd v30.4s, v30.4s, v16.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "str s31, [x22, x9]\n"
    "and v17.16b, v29.16b, v7.16b\n"
    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
    "ldr x22, [%x[outptrs], #0x58]\n"
    "srshl v30.4s, v30.4s, v7.4s\n"
    "sshl v27.4s, v27.4s, v9.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "and v16.16b, v28.16b, v7.16b\n"
    "add v30.4s, v30.4s, v10.4s\n"
    "sqadd v29.4s, v29.4s, v17.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smin v30.4s, v30.4s, v13.4s\n"
    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
    "srshl v29.4s, v29.4s, v7.4s\n"
    "smax v30.4s, v30.4s, v14.4s\n"
    "sqadd v28.4s, v28.4s, v16.4s\n"
    "and v16.16b, v27.16b, v7.16b\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "add v29.4s, v29.4s, v10.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "str s30, [x23, x9]\n"
    "smin v29.4s, v29.4s, v13.4s\n"
    "srshl v28.4s, v28.4s, v7.4s\n"
    "ldr x23, [%x[outptrs], #0x60]\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshl v26.4s, v26.4s, v9.4s\n"
    "smax v29.4s, v29.4s, v14.4s\n"
    "add v28.4s, v28.4s, v10.4s\n"
    "sqadd v27.4s, v27.4s, v16.4s\n"
    "uzp1 v29.16b, v29.16b, v29.16b\n"
    "smin v28.4s, v28.4s, v13.4s\n"
    "uzp1 v29.16b, v29.16b, v29.16b\n"
    "str s29, [x24, x9]\n"
    "smax v28.4s, v28.4s, v14.4s\n"
    "srshl v27.4s, v27.4s, v7.4s\n"
    "ldr x24, [%x[outptrs], #0x68]\n"
    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
    "sshl v25.4s, v25.4s, v9.4s\n"
    "uzp1 v28.16b, v28.16b, v28.16b\n"
    "add v27.4s, v27.4s, v10.4s\n"
    "uzp1 v28.16b, v28.16b, v28.16b\n"
    "str s28, [x25, x9]\n"
    "smin v27.4s, v27.4s, v13.4s\n"
    "and v17.16b, v26.16b, v7.16b\n"
    "ldr x25, [%x[outptrs], #0x70]\n"
    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
    "sshl v24.4s, v24.4s, v9.4s\n"
    "smax v27.4s, v27.4s, v14.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "and v16.16b, v25.16b, v7.16b\n"
    "uzp1 v27.16b, v27.16b, v27.16b\n"
    "sqadd v26.4s, v26.4s, v17.4s\n"
    "uzp1 v27.16b, v27.16b, v27.16b\n"
    "str s27, [x26, x9]\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
    "ldr x26, [%x[outptrs], #0x78]\n"
    "srshl v26.4s, v26.4s, v7.4s\n"
    "sshl v23.4s, v23.4s, v9.4s\n"
    "sqadd v25.4s, v25.4s, v16.4s\n"
    "and v17.16b, v24.16b, v7.16b\n"
    "add v26.4s, v26.4s, v10.4s\n"
    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
    "srshl v25.4s, v25.4s, v7.4s\n"
    "smin v26.4s, v26.4s, v13.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "and v16.16b, v23.16b, v7.16b\n"
    "smax v26.4s, v26.4s, v14.4s\n"
    "add v25.4s, v25.4s, v10.4s\n"
    "sqadd v24.4s, v24.4s, v17.4s\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "smin v25.4s, v25.4s, v13.4s\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "str s26, [x19, x9]\n"
    "smax v25.4s, v25.4s, v14.4s\n"
    "srshl v24.4s, v24.4s, v7.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshl v22.4s, v22.4s, v9.4s\n"
    "uzp1 v25.16b, v25.16b, v25.16b\n"
    "add v24.4s, v24.4s, v10.4s\n"
    "uzp1 v25.16b, v25.16b, v25.16b\n"
    "str s25, [x20, x9]\n"
    "smin v24.4s, v24.4s, v13.4s\n"
    "sqadd v23.4s, v23.4s, v16.4s\n"
    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
    "sshl v21.4s, v21.4s, v9.4s\n"
    "smax v24.4s, v24.4s, v14.4s\n"
    "srshl v23.4s, v23.4s, v7.4s\n"
    "and v17.16b, v22.16b, v7.16b\n"
    "uzp1 v24.16b, v24.16b, v24.16b\n"
    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
    "uzp1 v24.16b, v24.16b, v24.16b\n"
    "str s24, [x21, x9]\n"
    "add v23.4s, v23.4s, v10.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "and v16.16b, v21.16b, v7.16b\n"
    "sshl v20.4s, v20.4s, v9.4s\n"
    "smin v23.4s, v23.4s, v13.4s\n"
    "sqadd v22.4s, v22.4s, v17.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smax v23.4s, v23.4s, v14.4s\n"
    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
    "srshl v22.4s, v22.4s, v7.4s\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "sqadd v21.4s, v21.4s, v16.4s\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "str s23, [x22, x9]\n"
    "add v22.4s, v22.4s, v10.4s\n"
    "and v16.16b, v20.16b, v7.16b\n"
    "srshl v21.4s, v21.4s, v7.4s\n"
    "sshl v19.4s, v19.4s, v9.4s\n"
    "smin v22.4s, v22.4s, v13.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "add v21.4s, v21.4s, v10.4s\n"
    "smax v22.4s, v22.4s, v14.4s\n"
    "sqadd v20.4s, v20.4s, v16.4s\n"
    "smin v21.4s, v21.4s, v13.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "str s22, [x23, x9]\n"
    "smax v21.4s, v21.4s, v14.4s\n"
    "srshl v20.4s, v20.4s, v7.4s\n"
    "and v16.16b, v19.16b, v7.16b\n"
    "uzp1 v21.16b, v21.16b, v21.16b\n"
    "add v20.4s, v20.4s, v10.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "uzp1 v21.16b, v21.16b, v21.16b\n"
    "str s21, [x24, x9]\n"
    "smin v20.4s, v20.4s, v13.4s\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    "smax v20.4s, v20.4s, v14.4s\n"
    "srshl v19.4s, v19.4s, v7.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "str s20, [x25, x9]\n"
    "add v19.4s, v19.4s, v10.4s\n"
    "smin v19.4s, v19.4s, v13.4s\n"
    "smax v19.4s, v19.4s, v14.4s\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "str s19, [x26, x9]\n"
    "8:"  // Output channel loop: Done
    "add x9, x9, #0x4\n"
    "cmp x9, x28, LSL #2\n"
    "blt 1b\n"
    "tst %x[n_output_channels], #0x3\n"
    "beq 26f\n"
    "9:"  // Output channel oddments
    "movi v16.4s, #0x0\n"
    "cbz %x[bias], 12f\n"
    "add x19, %x[bias], x9, LSL #2\n"
    "tbz %x[n_output_channels], #1, 10f\n"
    "ld1 { v16.d }[0], [x19], #0x8\n"
    "tbz %x[n_output_channels], #0, 11f\n"
    "ld1 { v16.s }[2], [x19]\n"
    "b 11f\n"
    "10:"  // Output channel oddments: Load bias: Bit 1: Unset
    "tbz %x[n_output_channels], #0, 11f\n"
    "ld1 { v16.s }[0], [x19]\n"
    "11:"  // Output channel oddments: Load bias: Bit 1: End

    "12:"  // Output channel oddments: Load bias: Done
    "mov v6.16b, v16.16b\n"
    "mov v5.16b, v16.16b\n"
    "mov v4.16b, v16.16b\n"
    "mov v31.16b, v16.16b\n"
    "mov v30.16b, v16.16b\n"
    "mov v29.16b, v16.16b\n"
    "mov v28.16b, v16.16b\n"
    "mov v27.16b, v16.16b\n"
    "mov v26.16b, v16.16b\n"
    "mov v25.16b, v16.16b\n"
    "mov v24.16b, v16.16b\n"
    "mov v23.16b, v16.16b\n"
    "mov v22.16b, v16.16b\n"
    "mov v21.16b, v16.16b\n"
    "mov v20.16b, v16.16b\n"
    "mov v19.16b, v16.16b\n"
    "cbz %x[rq_mul_ptr], 18f\n"
    "add x21, %x[rq_mul_ptr], x9, LSL #2\n"
    "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n"
    "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n"
    "cbz %x[rq_left_shift_ptr], 15f\n"
    "tbz %x[n_output_channels], #1, 13f\n"
    "ld1 { v8.d }[0], [x21], #0x8\n"
    "ld1 { v7.d }[0], [x20], #0x8\n"
    "ld1 { v9.d }[0], [x19], #0x8\n"
    "tbz %x[n_output_channels], #0, 14f\n"
    "ld1 { v8.s }[2], [x21], #0x4\n"
    "ld1 { v7.s }[2], [x20], #0x4\n"
    "ld1 { v9.s }[2], [x19], #0x4\n"
    "b 14f\n"
    "13:"  // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
    "tbz %x[n_output_channels], #0, 14f\n"
    "ld1 { v8.s }[0], [x21], #0x4\n"
    "ld1 { v7.s }[0], [x20], #0x4\n"
    "ld1 { v9.s }[0], [x19], #0x4\n"
    "14:"  // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
    "b 18f\n"
    "15:"  // Output channel oddments: Load quantization parameters: No left shift
    "tbz %x[n_output_channels], #1, 16f\n"
    "ld1 { v8.d }[0], [x21], #0x8\n"
    "ld1 { v7.d }[0], [x20], #0x8\n"
    "tbz %x[n_output_channels], #0, 17f\n"
    "ld1 { v8.s }[2], [x21], #0x4\n"
    "ld1 { v7.s }[2], [x20], #0x4\n"
    "b 17f\n"
    "16:"  // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
    "tbz %x[n_output_channels], #0, 17f\n"
    "ld1 { v8.s }[0], [x21], #0x4\n"
    "ld1 { v7.s }[0], [x20], #0x4\n"
    "17:"  // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End

    "18:"  // Output channel oddments: Load quantization parameters: Done
    "ldr s17, [%x[weights]], #0x4\n"
    "ssubl v17.8h, v17.8b, v11.8b\n"
    "mov x19, %x[inptrs]\n"
    "ldp x25, x27, [x19], #0x10\n"
    "lsr x20, %x[kernel_points], #0x1\n"
    "ldr d3, [x25, #0x0]\n"
    "ssubl v3.8h, v3.8b, v12.8b\n"
    "ldr d2, [x27, #0x0]\n"
    "ssubl v2.8h, v2.8b, v12.8b\n"
    "cbz x20, 22f\n"
    "ldp x25, x27, [x19], #0x10\n"
    "ldr s16, [%x[weights]], #0x4\n"
    "ssubl v16.8h, v16.8b, v11.8b\n"
    "ldr d1, [x25, #0x0]\n"
    "subs x20, x20, #0x1\n"
    "ssubl v1.8h, v1.8b, v12.8b\n"
    "ldr d0, [x27, #0x0]\n"
    "ssubl v0.8h, v0.8b, v12.8b\n"
    "beq 20f\n"
    "19:"  // Output channel oddments: Kernel loop
    "smlal v6.4s, v17.4h, v3.h[0]\n"
    "ldp x25, x27, [x19], #0x10\n"
    "subs x20, x20, #0x1\n"
    "smlal v5.4s, v17.4h, v3.h[1]\n"
    "smlal v4.4s, v17.4h, v3.h[2]\n"
    "smlal v31.4s, v17.4h, v3.h[3]\n"
    "smlal v30.4s, v17.4h, v3.h[4]\n"
    "smlal v29.4s, v17.4h, v3.h[5]\n"
    "smlal v28.4s, v17.4h, v3.h[6]\n"
    "smlal v27.4s, v17.4h, v3.h[7]\n"
    "ldr d3, [x25, #0x0]\n"
    "smlal v26.4s, v17.4h, v2.h[0]\n"
    "smlal v25.4s, v17.4h, v2.h[1]\n"
    "smlal v24.4s, v17.4h, v2.h[2]\n"
    "smlal v23.4s, v17.4h, v2.h[3]\n"
    "smlal v22.4s, v17.4h, v2.h[4]\n"
    "smlal v21.4s, v17.4h, v2.h[5]\n"
    "smlal v20.4s, v17.4h, v2.h[6]\n"
    "smlal v19.4s, v17.4h, v2.h[7]\n"
    "ldr d2, [x27, #0x0]\n"
    "ssubl v3.8h, v3.8b, v12.8b\n"
    "ldr s17, [%x[weights]], #0x4\n"
    "smlal v6.4s, v16.4h, v1.h[0]\n"
    "ldp x25, x27, [x19], #0x10\n"
    "smlal v5.4s, v16.4h, v1.h[1]\n"
    "smlal v4.4s, v16.4h, v1.h[2]\n"
    "ssubl v2.8h, v2.8b, v12.8b\n"
    "ssubl v17.8h, v17.8b, v11.8b\n"
    "smlal v31.4s, v16.4h, v1.h[3]\n"
    "smlal v30.4s, v16.4h, v1.h[4]\n"
    "smlal v29.4s, v16.4h, v1.h[5]\n"
    "smlal v28.4s, v16.4h, v1.h[6]\n"
    "smlal v27.4s, v16.4h, v1.h[7]\n"
    "ldr d1, [x25, #0x0]\n"
    "smlal v26.4s, v16.4h, v0.h[0]\n"
    "smlal v25.4s, v16.4h, v0.h[1]\n"
    "smlal v24.4s, v16.4h, v0.h[2]\n"
    "smlal v23.4s, v16.4h, v0.h[3]\n"
    "smlal v22.4s, v16.4h, v0.h[4]\n"
    "smlal v21.4s, v16.4h, v0.h[5]\n"
    "smlal v20.4s, v16.4h, v0.h[6]\n"
    "smlal v19.4s, v16.4h, v0.h[7]\n"
    "ldr d0, [x27, #0x0]\n"
    "ssubl v1.8h, v1.8b, v12.8b\n"
    "ldr s16, [%x[weights]], #0x4\n"
    "ssubl v0.8h, v0.8b, v12.8b\n"
    "ssubl v16.8h, v16.8b, v11.8b\n"
    "bgt 19b\n"
    "20:"  // Output channel oddments: Kernel loop tail
    "tbnz %x[kernel_points], #0, 21f\n"
    "smlal v6.4s, v17.4h, v3.h[0]\n"
    "smlal v5.4s, v17.4h, v3.h[1]\n"
    "smlal v4.4s, v17.4h, v3.h[2]\n"
    "smlal v31.4s, v17.4h, v3.h[3]\n"
    "smlal v30.4s, v17.4h, v3.h[4]\n"
    "smlal v29.4s, v17.4h, v3.h[5]\n"
    "smlal v28.4s, v17.4h, v3.h[6]\n"
    "smlal v27.4s, v17.4h, v3.h[7]\n"
    "smlal v26.4s, v17.4h, v2.h[0]\n"
    "smlal v25.4s, v17.4h, v2.h[1]\n"
    "smlal v24.4s, v17.4h, v2.h[2]\n"
    "smlal v23.4s, v17.4h, v2.h[3]\n"
    "smlal v22.4s, v17.4h, v2.h[4]\n"
    "smlal v21.4s, v17.4h, v2.h[5]\n"
    "smlal v20.4s, v17.4h, v2.h[6]\n"
    "smlal v19.4s, v17.4h, v2.h[7]\n"
    "smlal v6.4s, v16.4h, v1.h[0]\n"
    "smlal v5.4s, v16.4h, v1.h[1]\n"
    "smlal v4.4s, v16.4h, v1.h[2]\n"
    "smlal v31.4s, v16.4h, v1.h[3]\n"
    "smlal v30.4s, v16.4h, v1.h[4]\n"
    "smlal v29.4s, v16.4h, v1.h[5]\n"
    "smlal v28.4s, v16.4h, v1.h[6]\n"
    "smlal v27.4s, v16.4h, v1.h[7]\n"
    "smlal v26.4s, v16.4h, v0.h[0]\n"
    "smlal v25.4s, v16.4h, v0.h[1]\n"
    "smlal v24.4s, v16.4h, v0.h[2]\n"
    "smlal v23.4s, v16.4h, v0.h[3]\n"
    "smlal v22.4s, v16.4h, v0.h[4]\n"
    "smlal v21.4s, v16.4h, v0.h[5]\n"
    "smlal v20.4s, v16.4h, v0.h[6]\n"
    "smlal v19.4s, v16.4h, v0.h[7]\n"
    "b 23f\n"
    "21:"  // Output channel oddments: Odd tail
    "smlal v6.4s, v17.4h, v3.h[0]\n"
    "ldp x25, x27, [x19], #0x10\n"
    "smlal v5.4s, v17.4h, v3.h[1]\n"
    "smlal v4.4s, v17.4h, v3.h[2]\n"
    "smlal v31.4s, v17.4h, v3.h[3]\n"
    "smlal v30.4s, v17.4h, v3.h[4]\n"
    "smlal v29.4s, v17.4h, v3.h[5]\n"
    "smlal v28.4s, v17.4h, v3.h[6]\n"
    "smlal v27.4s, v17.4h, v3.h[7]\n"
    "ldr d3, [x25, #0x0]\n"
    "smlal v26.4s, v17.4h, v2.h[0]\n"
    "smlal v25.4s, v17.4h, v2.h[1]\n"
    "smlal v24.4s, v17.4h, v2.h[2]\n"
    "smlal v23.4s, v17.4h, v2.h[3]\n"
    "smlal v22.4s, v17.4h, v2.h[4]\n"
    "smlal v21.4s, v17.4h, v2.h[5]\n"
    "smlal v20.4s, v17.4h, v2.h[6]\n"
    "smlal v19.4s, v17.4h, v2.h[7]\n"
    "ldr d2, [x27, #0x0]\n"
    "ssubl v3.8h, v3.8b, v12.8b\n"
    "ldr s17, [%x[weights]], #0x4\n"
    "smlal v6.4s, v16.4h, v1.h[0]\n"
    "smlal v5.4s, v16.4h, v1.h[1]\n"
    "smlal v4.4s, v16.4h, v1.h[2]\n"
    "ssubl v2.8h, v2.8b, v12.8b\n"
    "ssubl v17.8h, v17.8b, v11.8b\n"
    "smlal v31.4s, v16.4h, v1.h[3]\n"
    "smlal v30.4s, v16.4h, v1.h[4]\n"
    "smlal v29.4s, v16.4h, v1.h[5]\n"
    "smlal v28.4s, v16.4h, v1.h[6]\n"
    "smlal v27.4s, v16.4h, v1.h[7]\n"
    "smlal v26.4s, v16.4h, v0.h[0]\n"
    "smlal v25.4s, v16.4h, v0.h[1]\n"
    "smlal v24.4s, v16.4h, v0.h[2]\n"
    "smlal v23.4s, v16.4h, v0.h[3]\n"
    "smlal v22.4s, v16.4h, v0.h[4]\n"
    "smlal v21.4s, v16.4h, v0.h[5]\n"
    "smlal v20.4s, v16.4h, v0.h[6]\n"
    "smlal v19.4s, v16.4h, v0.h[7]\n"
    "smlal v6.4s, v17.4h, v3.h[0]\n"
    "smlal v5.4s, v17.4h, v3.h[1]\n"
    "smlal v4.4s, v17.4h, v3.h[2]\n"
    "smlal v31.4s, v17.4h, v3.h[3]\n"
    "smlal v30.4s, v17.4h, v3.h[4]\n"
    "smlal v29.4s, v17.4h, v3.h[5]\n"
    "smlal v28.4s, v17.4h, v3.h[6]\n"
    "smlal v27.4s, v17.4h, v3.h[7]\n"
    "smlal v26.4s, v17.4h, v2.h[0]\n"
    "smlal v25.4s, v17.4h, v2.h[1]\n"
    "smlal v24.4s, v17.4h, v2.h[2]\n"
    "smlal v23.4s, v17.4h, v2.h[3]\n"
    "smlal v22.4s, v17.4h, v2.h[4]\n"
    "smlal v21.4s, v17.4h, v2.h[5]\n"
    "smlal v20.4s, v17.4h, v2.h[6]\n"
    "smlal v19.4s, v17.4h, v2.h[7]\n"
    "b 23f\n"
    "22:"  // Output channel oddments: Single kernel point
    "smlal v6.4s, v17.4h, v3.h[0]\n"
    "smlal v5.4s, v17.4h, v3.h[1]\n"
    "smlal v4.4s, v17.4h, v3.h[2]\n"
    "smlal v31.4s, v17.4h, v3.h[3]\n"
    "smlal v30.4s, v17.4h, v3.h[4]\n"
    "smlal v29.4s, v17.4h, v3.h[5]\n"
    "smlal v28.4s, v17.4h, v3.h[6]\n"
    "smlal v27.4s, v17.4h, v3.h[7]\n"
    "smlal v26.4s, v17.4h, v2.h[0]\n"
    "smlal v25.4s, v17.4h, v2.h[1]\n"
    "smlal v24.4s, v17.4h, v2.h[2]\n"
    "smlal v23.4s, v17.4h, v2.h[3]\n"
    "smlal v22.4s, v17.4h, v2.h[4]\n"
    "smlal v21.4s, v17.4h, v2.h[5]\n"
    "smlal v20.4s, v17.4h, v2.h[6]\n"
    "smlal v19.4s, v17.4h, v2.h[7]\n"
    "23:"  // Output channel oddments: Done
    "sshl v6.4s, v6.4s, v9.4s\n"
    "sshl v5.4s, v5.4s, v9.4s\n"
    "sshl v4.4s, v4.4s, v9.4s\n"
    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
    "sshl v31.4s, v31.4s, v9.4s\n"
    "and v18.16b, v6.16b, v7.16b\n"
    "and v16.16b, v5.16b, v7.16b\n"
    "and v17.16b, v4.16b, v7.16b\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "sqadd v6.4s, v6.4s, v18.4s\n"
    "sqadd v5.4s, v5.4s, v16.4s\n"
    "sqadd v4.4s, v4.4s, v17.4s\n"
    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
    "srshl v6.4s, v6.4s, v7.4s\n"
    "srshl v5.4s, v5.4s, v7.4s\n"
    "srshl v4.4s, v4.4s, v7.4s\n"
    "and v16.16b, v31.16b, v7.16b\n"
    "add v6.4s, v6.4s, v10.4s\n"
    "add v5.4s, v5.4s, v10.4s\n"
    "add v4.4s, v4.4s, v10.4s\n"
    "smin v6.4s, v6.4s, v13.4s\n"
    "smin v5.4s, v5.4s, v13.4s\n"
    "smin v4.4s, v4.4s, v13.4s\n"
    "smax v6.4s, v6.4s, v14.4s\n"
    "smax v5.4s, v5.4s, v14.4s\n"
    "smax v4.4s, v4.4s, v14.4s\n"
    "uzp1 v6.16b, v6.16b, v6.16b\n"
    "uzp1 v5.16b, v5.16b, v5.16b\n"
    "uzp1 v6.16b, v6.16b, v6.16b\n"
    "uzp1 v5.16b, v5.16b, v5.16b\n"
    "uzp1 v4.16b, v4.16b, v4.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "uzp1 v4.16b, v4.16b, v4.16b\n"
    "sshl v30.4s, v30.4s, v9.4s\n"
    "sqadd v31.4s, v31.4s, v16.4s\n"
    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
    "sshl v29.4s, v29.4s, v9.4s\n"
    "sshl v28.4s, v28.4s, v9.4s\n"
    "srshl v31.4s, v31.4s, v7.4s\n"
    "and v16.16b, v30.16b, v7.16b\n"
    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
    "add v31.4s, v31.4s, v10.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "and v17.16b, v29.16b, v7.16b\n"
    "smin v31.4s, v31.4s, v13.4s\n"
    "sqadd v30.4s, v30.4s, v16.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "smax v31.4s, v31.4s, v14.4s\n"
    "and v16.16b, v28.16b, v7.16b\n"
    "srshl v30.4s, v30.4s, v7.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "sqadd v29.4s, v29.4s, v17.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "add v30.4s, v30.4s, v10.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "srshl v29.4s, v29.4s, v7.4s\n"
    "smin v30.4s, v30.4s, v13.4s\n"
    "sqadd v28.4s, v28.4s, v16.4s\n"
    "sshl v27.4s, v27.4s, v9.4s\n"
    "smax v30.4s, v30.4s, v14.4s\n"
    "add v29.4s, v29.4s, v10.4s\n"
    "srshl v28.4s, v28.4s, v7.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "smin v29.4s, v29.4s, v13.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "add v28.4s, v28.4s, v10.4s\n"
    "smax v29.4s, v29.4s, v14.4s\n"
    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
    "smin v28.4s, v28.4s, v13.4s\n"
    "uzp1 v29.16b, v29.16b, v29.16b\n"
    "sshl v26.4s, v26.4s, v9.4s\n"
    "uzp1 v29.16b, v29.16b, v29.16b\n"
    "smax v28.4s, v28.4s, v14.4s\n"
    "and v16.16b, v27.16b, v7.16b\n"
    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
    "uzp1 v28.16b, v28.16b, v28.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "uzp1 v28.16b, v28.16b, v28.16b\n"
    "and v17.16b, v26.16b, v7.16b\n"
    "sqadd v27.4s, v27.4s, v16.4s\n"
    "sshl v25.4s, v25.4s, v9.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
    "srshl v27.4s, v27.4s, v7.4s\n"
    "sqadd v26.4s, v26.4s, v17.4s\n"
    "sshl v24.4s, v24.4s, v9.4s\n"
    "and v16.16b, v25.16b, v7.16b\n"
    "add v27.4s, v27.4s, v10.4s\n"
    "srshl v26.4s, v26.4s, v7.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smin v27.4s, v27.4s, v13.4s\n"
    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
    "add v26.4s, v26.4s, v10.4s\n"
    "smax v27.4s, v27.4s, v14.4s\n"
    "sqadd v25.4s, v25.4s, v16.4s\n"
    "smin v26.4s, v26.4s, v13.4s\n"
    "uzp1 v27.16b, v27.16b, v27.16b\n"
    "and v17.16b, v24.16b, v7.16b\n"
    "uzp1 v27.16b, v27.16b, v27.16b\n"
    "smax v26.4s, v26.4s, v14.4s\n"
    "srshl v25.4s, v25.4s, v7.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "sshl v23.4s, v23.4s, v9.4s\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "add v25.4s, v25.4s, v10.4s\n"
    "sqadd v24.4s, v24.4s, v17.4s\n"
    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
    "smin v25.4s, v25.4s, v13.4s\n"
    "sshl v22.4s, v22.4s, v9.4s\n"
    "srshl v24.4s, v24.4s, v7.4s\n"
    "smax v25.4s, v25.4s, v14.4s\n"
    "and v16.16b, v23.16b, v7.16b\n"
    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
    "uzp1 v25.16b, v25.16b, v25.16b\n"
    "add v24.4s, v24.4s, v10.4s\n"
    "uzp1 v25.16b, v25.16b, v25.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smin v24.4s, v24.4s, v13.4s\n"
    "and v17.16b, v22.16b, v7.16b\n"
    "sqadd v23.4s, v23.4s, v16.4s\n"
    "smax v24.4s, v24.4s, v14.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "sshl v21.4s, v21.4s, v9.4s\n"
    "uzp1 v24.16b, v24.16b, v24.16b\n"
    "srshl v23.4s, v23.4s, v7.4s\n"
    "uzp1 v24.16b, v24.16b, v24.16b\n"
    "sqadd v22.4s, v22.4s, v17.4s\n"
    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
    "add v23.4s, v23.4s, v10.4s\n"
    "sshl v20.4s, v20.4s, v9.4s\n"
    "srshl v22.4s, v22.4s, v7.4s\n"
    "smin v23.4s, v23.4s, v13.4s\n"
    "and v16.16b, v21.16b, v7.16b\n"
    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
    "smax v23.4s, v23.4s, v14.4s\n"
    "add v22.4s, v22.4s, v10.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "smin v22.4s, v22.4s, v13.4s\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "sqadd v21.4s, v21.4s, v16.4s\n"
    "smax v22.4s, v22.4s, v14.4s\n"
    "and v16.16b, v20.16b, v7.16b\n"
    "sshl v19.4s, v19.4s, v9.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "srshl v21.4s, v21.4s, v7.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
    "add v21.4s, v21.4s, v10.4s\n"
    "sqadd v20.4s, v20.4s, v16.4s\n"
    "smin v21.4s, v21.4s, v13.4s\n"
    "and v16.16b, v19.16b, v7.16b\n"
    "srshl v20.4s, v20.4s, v7.4s\n"
    "smax v21.4s, v21.4s, v14.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "uzp1 v21.16b, v21.16b, v21.16b\n"
    "add v20.4s, v20.4s, v10.4s\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    "uzp1 v21.16b, v21.16b, v21.16b\n"
    "smin v20.4s, v20.4s, v13.4s\n"
    "srshl v19.4s, v19.4s, v7.4s\n"
    "smax v20.4s, v20.4s, v14.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "add v19.4s, v19.4s, v10.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "smin v19.4s, v19.4s, v13.4s\n"
    "smax v19.4s, v19.4s, v14.4s\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "tbz %x[n_output_channels], #1, 24f\n"
    "ldr x19, [%x[outptrs], #0x0]\n"
    "ldr x20, [%x[outptrs], #0x8]\n"
    "add x19, x19, x9\n"
    "ldr x21, [%x[outptrs], #0x10]\n"
    "ldr x22, [%x[outptrs], #0x18]\n"
    "add x20, x20, x9\n"
    "st1 { v6.h }[0], [x19]\n"
    "add x21, x21, x9\n"
    "st1 { v5.h }[0], [x20]\n"
    "ldr x23, [%x[outptrs], #0x20]\n"
    "add x22, x22, x9\n"
    "st1 { v4.h }[0], [x21]\n"
    "add x23, x23, x9\n"
    "st1 { v31.h }[0], [x22]\n"
    "ldr x24, [%x[outptrs], #0x28]\n"
    "add x24, x24, x9\n"
    "st1 { v30.h }[0], [x23]\n"
    "ldr x25, [%x[outptrs], #0x30]\n"
    "add x25, x25, x9\n"
    "st1 { v29.h }[0], [x24]\n"
    "ldr x26, [%x[outptrs], #0x38]\n"
    "add x26, x26, x9\n"
    "st1 { v28.h }[0], [x25]\n"
    "ldr x19, [%x[outptrs], #0x40]\n"
    "add x19, x19, x9\n"
    "st1 { v27.h }[0], [x26]\n"
    "ldr x20, [%x[outptrs], #0x48]\n"
    "add x20, x20, x9\n"
    "st1 { v26.h }[0], [x19]\n"
    "ldr x21, [%x[outptrs], #0x50]\n"
    "add x21, x21, x9\n"
    "st1 { v25.h }[0], [x20]\n"
    "ldr x22, [%x[outptrs], #0x58]\n"
    "add x22, x22, x9\n"
    "st1 { v24.h }[0], [x21]\n"
    "ldr x23, [%x[outptrs], #0x60]\n"
    "add x23, x23, x9\n"
    "st1 { v23.h }[0], [x22]\n"
    "ldr x24, [%x[outptrs], #0x68]\n"
    "add x24, x24, x9\n"
    "st1 { v22.h }[0], [x23]\n"
    "ldr x25, [%x[outptrs], #0x70]\n"
    "add x25, x25, x9\n"
    "st1 { v21.h }[0], [x24]\n"
    "ldr x26, [%x[outptrs], #0x78]\n"
    "add x26, x26, x9\n"
    "st1 { v20.h }[0], [x25]\n"
    "add x9, x9, #0x2\n"
    "st1 { v19.h }[0], [x26]\n"
    "tbz %x[n_output_channels], #0, 25f\n"
    "ldr x19, [%x[outptrs], #0x0]\n"
    "ldr x20, [%x[outptrs], #0x8]\n"
    "add x19, x19, x9\n"
    "ldr x21, [%x[outptrs], #0x10]\n"
    "ldr x22, [%x[outptrs], #0x18]\n"
    "add x20, x20, x9\n"
    "st1 { v6.b }[2], [x19]\n"
    "add x21, x21, x9\n"
    "st1 { v5.b }[2], [x20]\n"
    "ldr x23, [%x[outptrs], #0x20]\n"
    "add x22, x22, x9\n"
    "st1 { v4.b }[2], [x21]\n"
    "add x23, x23, x9\n"
    "st1 { v31.b }[2], [x22]\n"
    "ldr x24, [%x[outptrs], #0x28]\n"
    "add x24, x24, x9\n"
    "st1 { v30.b }[2], [x23]\n"
    "ldr x25, [%x[outptrs], #0x30]\n"
    "add x25, x25, x9\n"
    "st1 { v29.b }[2], [x24]\n"
    "ldr x26, [%x[outptrs], #0x38]\n"
    "add x26, x26, x9\n"
    "st1 { v28.b }[2], [x25]\n"
    "ldr x19, [%x[outptrs], #0x40]\n"
    "add x19, x19, x9\n"
    "st1 { v27.b }[2], [x26]\n"
    "ldr x20, [%x[outptrs], #0x48]\n"
    "add x20, x20, x9\n"
    "st1 { v26.b }[2], [x19]\n"
    "ldr x21, [%x[outptrs], #0x50]\n"
    "add x21, x21, x9\n"
    "st1 { v25.b }[2], [x20]\n"
    "ldr x22, [%x[outptrs], #0x58]\n"
    "add x22, x22, x9\n"
    "st1 { v24.b }[2], [x21]\n"
    "ldr x23, [%x[outptrs], #0x60]\n"
    "add x23, x23, x9\n"
    "st1 { v23.b }[2], [x22]\n"
    "ldr x24, [%x[outptrs], #0x68]\n"
    "add x24, x24, x9\n"
    "st1 { v22.b }[2], [x23]\n"
    "ldr x25, [%x[outptrs], #0x70]\n"
    "add x25, x25, x9\n"
    "st1 { v21.b }[2], [x24]\n"
    "ldr x26, [%x[outptrs], #0x78]\n"
    "add x26, x26, x9\n"
    "st1 { v20.b }[2], [x25]\n"
    "st1 { v19.b }[2], [x26]\n"
    "b 25f\n"
    "24:"  // Output channel oddments: Done: Store: Bit 1: Unset
    "tbz %x[n_output_channels], #0, 25f\n"
    "ldr x19, [%x[outptrs], #0x0]\n"
    "ldr x20, [%x[outptrs], #0x8]\n"
    "add x19, x19, x9\n"
    "ldr x21, [%x[outptrs], #0x10]\n"
    "ldr x22, [%x[outptrs], #0x18]\n"
    "add x20, x20, x9\n"
    "st1 { v6.b }[0], [x19]\n"
    "add x21, x21, x9\n"
    "st1 { v5.b }[0], [x20]\n"
    "ldr x23, [%x[outptrs], #0x20]\n"
    "add x22, x22, x9\n"
    "st1 { v4.b }[0], [x21]\n"
    "add x23, x23, x9\n"
    "st1 { v31.b }[0], [x22]\n"
    "ldr x24, [%x[outptrs], #0x28]\n"
    "add x24, x24, x9\n"
    "st1 { v30.b }[0], [x23]\n"
    "ldr x25, [%x[outptrs], #0x30]\n"
    "add x25, x25, x9\n"
    "st1 { v29.b }[0], [x24]\n"
    "ldr x26, [%x[outptrs], #0x38]\n"
    "add x26, x26, x9\n"
    "st1 { v28.b }[0], [x25]\n"
    "ldr x19, [%x[outptrs], #0x40]\n"
    "add x19, x19, x9\n"
    "st1 { v27.b }[0], [x26]\n"
    "ldr x20, [%x[outptrs], #0x48]\n"
    "add x20, x20, x9\n"
    "st1 { v26.b }[0], [x19]\n"
    "ldr x21, [%x[outptrs], #0x50]\n"
    "add x21, x21, x9\n"
    "st1 { v25.b }[0], [x20]\n"
    "ldr x22, [%x[outptrs], #0x58]\n"
    "add x22, x22, x9\n"
    "st1 { v24.b }[0], [x21]\n"
    "ldr x23, [%x[outptrs], #0x60]\n"
    "add x23, x23, x9\n"
    "st1 { v23.b }[0], [x22]\n"
    "ldr x24, [%x[outptrs], #0x68]\n"
    "add x24, x24, x9\n"
    "st1 { v22.b }[0], [x23]\n"
    "ldr x25, [%x[outptrs], #0x70]\n"
    "add x25, x25, x9\n"
    "st1 { v21.b }[0], [x24]\n"
    "ldr x26, [%x[outptrs], #0x78]\n"
    "add x26, x26, x9\n"
    "st1 { v20.b }[0], [x25]\n"
    "st1 { v19.b }[0], [x26]\n"
    "25:"  // Output channel oddments: Done: Store: Bit 1: End

    "26:"  // Done

    : [weights] "+&r" (weights)
    : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
  );
}

}  // namespace depthwise
}  // namespace arm_conv
#endif // defined(__aarch64__)
